import pandas as pd
# Load the heart-disease dataset and take a first look at it.
df = pd.read_csv("heart.csv")
# Show the first five rows
df.head()
df.shape
# Show the last five rows
df.tail()
# List the column names
df.columns
# Summary statistics of the columns
df.describe()
# Print general information about the dataset
df.info()
# Count the missing values in each column
df.isnull().sum()
import pandas_profiling
profile = pandas_profiling.ProfileReport(df)
# Display the generated dataset report
profile
# Save the report to a local HTML file
profile.to_file('profile.html')
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlation between all features (computed once, reused for the plot)
corr_matrix = df.corr()
corr_matrix
# Visualize the correlation matrix as a heat map
plt.figure(figsize=(10, 10), dpi=400)
# annot=True writes the numeric value inside every cell
sns.heatmap(corr_matrix, annot=True, fmt='.1f', square=True)
plt.show()
# Look up the API documentation. The IPython-only `sns.heatmap?` form is a
# SyntaxError in a plain Python file, so use the built-in help() instead.
help(sns.heatmap)
# Pairwise scatter plots of all features
sns.pairplot(df)
plt.show()
# Distribution of a single feature
# NOTE(review): distplot is reported as deprecated in newer seaborn releases —
# confirm against the installed version before upgrading.
sns.distplot(df['age'])
plt.show()
# Bracket access and attribute access select the same column
df["age"].max()
df.age.max()
# Distinct values in the column
df.age.unique()
# Number of occurrences of each value
df.target.value_counts()
# Count plots for individual columns
sns.countplot(x="target", data=df, palette="bwr")
plt.show()
sns.countplot(x="sex", data=df, palette="mako_r")
plt.xlabel("Sex(0 =female, 1=male)")
plt.show()
# Relationship between a single feature (age) and the label
age_by_target = pd.crosstab(df.age, df.target)
age_by_target.plot(kind="bar", figsize=(20, 6))
plt.title("Heart Disease Frequency for Ages")  # fixed typo: was "Hear Disease ..."
plt.xlabel("Age")
plt.ylabel("Frequency")
# Save before show(): the figure is written to disk while it is still current
plt.savefig("heartDiseaseAndAges.png")
plt.show()
# Box plot of age per label value
sns.boxplot(x=df.target, y=df.age)
plt.show()
# Violin plot of age per label value
sns.violinplot(x=df.target, y=df.age)
plt.show()
# Scatter plot: age against maximum heart rate, coloured by disease status
has_disease = df.target == 1
no_disease = df.target == 0
plt.scatter(x=df.age[has_disease], y=df.thalach[has_disease], c="red")
plt.scatter(x=df.age[no_disease], y=df.thalach[no_disease], c="blue")
# Legend entries follow the order of the two scatter calls above
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()
# Silence the noisy warning output
import warnings
warnings.filterwarnings("ignore")
# Replace the abbreviated column names with descriptive ones.
# The spellings below are kept as-is because later code refers to them.
df.columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
    'fasting_blood_sugar', 'rest_ecg',
    'max_hear_rate_achieced', 'exercise_induced_angina', 'st_depression',
    'st_slope', 'num_major_vessels', 'thalassemin', 'target',
]
df.head()
# Convert the integer codes of the nominal features into readable strings.
# Each column is replaced in one assignment. The previous chained form
# `df[col][mask] = value` writes through an intermediate object and is not
# guaranteed to modify `df` (it is a no-op under pandas copy-on-write).
category_labels = {
    "sex": {0: 'female', 1: 'male'},
    'chest_pain_type': {
        0: "typical angina",
        1: "atypical angina",
        2: "non-anginal pain",
        3: "asymptomatic",
    },
    "fasting_blood_sugar": {
        0: "lower than 120mg/ml",
        1: "greater than 120mg/ml",
    },
    'rest_ecg': {
        0: "normal",
        1: "ST-T wave abnormality",
        2: "left ventricular hypertrophy",
    },
    # Fixed: the labels were swapped (0 -> "yes", 1 -> "no"). In the dataset
    # description 1 means angina was induced, matching the 1 == true coding
    # used for the other binary columns here.
    'exercise_induced_angina': {0: "no", 1: "yes"},
    'st_slope': {0: 'upsloping', 1: 'flat', 2: 'downsloping'},
    'thalassemin': {
        0: 'unknown',
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}
for column, labels in category_labels.items():
    # Codes that are not listed are left untouched, as before.
    df[column] = df[column].replace(labels)
df.head()
# One-hot encoding (object-typed columns are split into separate columns)
df = pd.get_dummies(df)
df.columns
df.head()
# Export the processed dataset
df.to_csv("process_heart.csv", index=False)
from pdpbox import pdp, get_dataset, info_plots
# Target distribution across the values of a single feature: gender
fig, axed, summary_df = info_plots.target_plot(
    df=df, feature='sex_male', feature_name='gender', target=['target']
)
# Optional: relabel the bars with an ordered list, e.g.
#   _ = axed['bar_ax'].set_xticklabels(['Female', 'Male'])
# (TODO confirm the 'bar_ax' key against the installed pdpbox version)
# Target distribution across the values of a single feature: age
fig, axed, summary_df = info_plots.target_plot(
    df=df, feature='age', feature_name='age', target=['target']
)
# Target distribution across a pair of features
feat_name1 = 'age'
nick_name1 = 'age'
feat_name2 = 'max_hear_rate_achieced'  # actual column name, spelling must match df
nick_name2 = 'max_heart_rate'  # display label only; fixed typo (was 'max_hart_rate')
fig, axed, summary_df = info_plots.target_plot_interact(
    df=df,
    features=[feat_name1, feat_name2],
    feature_names=[nick_name1, nick_name2],
    target=['target'],
)
plt.show()
# Feature matrix: every column except the label
X = df.drop(columns='target')
X.shape
# Label vector
Y = df['target']
Y
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)
X.shape
X_test.shape
from sklearn.ensemble import RandomForestClassifier
# Fit a small random forest on the training split
model = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=5)
model.fit(X_train, Y_train)
# Pick the decision tree at index 7 of the forest
estimator = model.estimators_[7]
estimator
feature_names = X_train.columns
# Per-sample string labels for the training rows
Y_train_str = Y_train.astype('str')
Y_train_str[Y_train_str == '0'] = 'no disease'
Y_train_str[Y_train_str == '1'] = 'disease'
Y_train_str = Y_train_str.values
# export_graphviz expects ONE name per class, in ascending class order (0, 1).
# Passing the per-sample array above would label the nodes with whatever the
# first two training rows happen to be.
class_names = ['no disease', 'disease']
# Decision-tree visualization
from sklearn.tree import export_graphviz
import os
# Make the Graphviz `dot` executable discoverable (Windows install location).
# Fixed: the string literal was missing its closing quote.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(feature_names),
                class_names=class_names,
                rounded=True, proportion=True,
                label='root',
                precision=2, filled=True)
from subprocess import call
# Render the .dot description to a high-resolution PNG
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
from IPython.display import Image
Image(filename='tree.png')
# Feature importances of the fitted forest
model.feature_importances_
import numpy as np
print("特征排序:")
feature_names = X_test.columns
feature_importances = model.feature_importances_
# Indices of the features from most to least important
indices = np.argsort(feature_importances)[::-1]
ranked_names = np.array(feature_names)[indices]
ranked_scores = feature_importances[indices]
for name, score in zip(ranked_names, ranked_scores):
    print(f"feature {name} ({score:f})")
# Weights of the individual features for the selected tree
import eli5
eli5.show_weights(estimator, feature_names=feature_names.to_list())
# Bar chart of the importances, in descending order
bar_positions = range(len(feature_importances))
plt.figure(figsize=(16, 8))
plt.title("Feature Importance")
plt.bar(bar_positions, ranked_scores, color='b')
plt.xticks(bar_positions, ranked_names, color='b', rotation=90)
plt.show()
# Inspect the test split and predict for a single sample and for the whole split.
X_test.shape
X_test.head()
test_sample = X_test.iloc[2]
test_sample.shape
# Reshape into a 2-D array with a single row
test_sample = np.array(test_sample).reshape(1,-1)
test_sample.shape
# Hard (class label) prediction for the binary task
model.predict(test_sample)
# Soft (class probability) prediction for the binary task
model.predict_proba(test_sample)
model.predict(X_test)
model.predict_proba(X_test)
# Confidence of the disease class (second probability column)
model.predict_proba(X_test)[:,1]
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix_model = confusion_matrix(Y_test, y_pred)
confusion_matrix_model
# Template for plotting a confusion matrix
import itertools
def cnf_matrix_plotter(cm, classes):
    """Draw a confusion matrix as an annotated image and show it.

    Args:
        cm: 2-D array of counts. Rows are drawn along the "True Label"
            axis and columns along the "Predicted Label" axis.
        classes: Tick labels, one per class, used on both axes.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Oranges)
    plt.title("Confusion Matrix")
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    cutoff = cm.max() / 2
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        if cm[row, col] > cutoff:
            text_color = 'white'
        else:
            text_color = "black"
        plt.text(
            col,
            row,
            cm[row, col],
            horizontalalignment='center',
            color=text_color,
            fontsize=25,
        )

    plt.tight_layout()
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.show()
cnf_matrix_plotter(confusion_matrix_model, ['Healthy', 'Disease'])
# Predicted probability of the disease class (second probability column)
y_pred_quant = model.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_curve, auc
fpr, tpr, threshold = roc_curve(Y_test, y_pred_quant)
fpr
tpr
# Decision thresholds
threshold
# Draw the ROC curve together with the diagonal reference line
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='--', c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title("ROC curve")  # fixed typo: was "ROC curse"
plt.xlabel("False Positive Rate (1 - Specificity)")  # fixed typo: was "Positice"
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)
# Render the figure explicitly so it also appears outside a notebook
plt.show()
# Area under the ROC curve
auc(fpr, tpr)
import eli5
from eli5.sklearn import PermutationImportance
# Permutation importance: shuffle features and see how the score changes
perm = PermutationImportance(model, random_state=1).fit(X_test, Y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())
# Model predictions across the values of a single feature: gender
fig, axes, summary_df = info_plots.actual_plot(
    model=model, X=X_train, feature='sex_male', feature_name='gender',
    predict_kwds={},
)
# Model predictions across the values of a single feature: number of vessels.
# Fixed: the display name was copy-pasted as 'gender'.
fig, axes, summary_df = info_plots.actual_plot(
    model=model, X=X_train, feature='num_major_vessels',
    feature_name='num_vessels', predict_kwds={},
)
# Partial dependence of the prediction on the number of major vessels
feat_name = 'num_major_vessels'
nick_name = 'num_vessels'
pdp_dist = pdp.pdp_isolate(
    model=model, dataset=X_test, model_features=feature_names, feature=feat_name
)
fig, axes = pdp.pdp_plot(
    pdp_dist, nick_name, plot_lines=True, frac_to_plot=0.8, plot_pts_dist=True
)
# Partial dependence of the prediction on the maximum heart rate
pdp_dist = pdp.pdp_isolate(
    model=model, dataset=X_test, model_features=feature_names,
    feature="max_hear_rate_achieced",
)
fig, axes = pdp.pdp_plot(pdp_dist, 'max_heart_rate')
# Partial dependence plot for every feature in turn.
# Fixed: the loop used to compute each pdp_isolate result and discard it
# without plotting, so the trailing plt.show() had nothing new to display.
for each in feature_names:
    feat_name = each
    pdp_dist = pdp.pdp_isolate(
        model=model, dataset=X_test, model_features=feature_names, feature=feat_name
    )
    fig, axes = pdp.pdp_plot(pdp_dist, feat_name)
    plt.show()
# Joint partial dependence of two features
feat_name1 = "max_hear_rate_achieced"  # actual column name, spelling must match X_test
nick_name1 = "max_heart_rate"  # display label only; fixed typo (was "max_hear_rate")
feat_name2 = "num_major_vessels"
nick_name2 = "num_vessels"
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_test,
    model_features=feature_names,
    features=[feat_name1, feat_name2],
)
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=[nick_name1, nick_name2],
    plot_type="contour",
    x_quantile=True,
    plot_pdp=True,
)
import shap
# Explain the fitted forest's test-set predictions with SHAP values.
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
len(shap_values)
shap_values[0].shape
shap_values[1].shape
# Average probability of disease and of no disease
explainer.expected_value
shap.summary_plot(shap_values[1], X_test, plot_type='bar')
# Each row is one feature; red marks data points where that feature's value is
# high; the farther right a point lies, the more positively the feature
# correlates with having the disease
shap.summary_plot(shap_values[1], X_test)
shap.summary_plot(shap_values[1], X_test, plot_type='violin')
# For a single patient
idx = 126
patient = X.iloc[idx,:]
patient